* Session setup.
* Turn off output paging; capture lets the script continue if the setting
* cannot be changed.
capture set more off
* Start from an empty session before any data are loaded.
clear all

*******************************************************************************************************************************************
// Admin
*******************************************************************************************************************************************
/*
	// Set directory path
	foreach i in "C:/Users/Andy/Dropbox/projects/WWI German Discrimination"		///
				 "C:/Users/anf137/Dropbox/projects/WWI German Discrimination"	///
				 "Z:/Dropbox/projects/WWI German Discrimination"				{
		
				global path "`i'"
				confirmdir "$path"
				if `r(confirmdir)'==0 continue, break
	}
*/

*******************************************************************************************************************************************
// Merge linked Census data with casualty, geographic, and aggregate Census controls and generate relevant variables
*******************************************************************************************************************************************

	* Load the linked 1910-1920 Census file and apply the initial sample
	* restrictions: age window, labor-force status, unambiguous links, and
	* exactly two observations per individual.

		// The path global is not defined anywhere in this file (the directory
		// block above is commented out), so it must be set by the caller.
		// Fail early with a clear message instead of a file-not-found error.
		if `"$path"' == "" {
			display as error "global macro path is not set; define it before running this do-file"
			exit 198
		}

		use "$path/Replication/cleaned_data/americans1910-20_linked.dta"
		
		* keep people of working age (16 to 65) who are in the labor force
		* NOTE(review): no restriction on sex is applied here; presumably the
		* linked file already contains men only -- TODO confirm.
		keep if age>15 & age<66
		
		* labforce==1 is the not-in-labor-force code used throughout this file
		drop if labforce==1
		
		* Drop every record whose 1910 or 1920 identifier appears more than once
		* within a Census year. duplicates tag stores the number of surplus
		* copies (0 = unique), so the test must be >0: testing ==1 would remove
		* pairs only and keep identifiers that occur three or more times.
		duplicates tag histid_1910 year, gen(tag10)
		duplicates tag histid_1920 year, gen(tag20)
		drop if tag10>0 | tag20>0
		drop tag10 tag20
		
		* keep those who have a full panel (2 periods)
		bysort id: keep if _N==2
		
		* County identifiers.
		* county is a copy of countyicp; the combined code is built as
		* state code * 10000 + county code and then divided by 10 before
		* being renamed to fips.
		generate county   = countyicp
		generate stcounty = (10000*statefip + county)/10
		rename stcounty fips

	* Harmonize county codes to 1910 boundaries using the project's helper
	* do-file (its contents are not visible from this file).
		quietly do "$path/Replication/code/standardize_county_boundaries_to_1910.do"

	* County-level WWI casualty data: only observations whose county is found
	* in the casualty file are retained.
		merge m:1 fips using "$path/Replication/cleaned_data/WWI casualties Army Navy clean - county.dta", keep(match) nogenerate

	* County-level WWI draft (induction) data: all Census observations are
	* retained, counties present only in the draft file are discarded.
		generate statefips = statefip
		merge m:1 statefips county using "$path/Replication/cleaned_data/WWI inductions by county - clean.dta", keep(master match) nogenerate

	* County latitude and longitude: same retention rule as the draft merge.
	* The key icpsrfip is a copy of fips.
		generate icpsrfip = fips
		merge m:1 icpsrfip county using "$path/Replication/raw_data/county_lat_lon.dta", keep(master match) nogenerate

	* Granular 1910 age distribution by county: same retention rule.
		merge m:1 fips using "$path/Replication/raw_data/age_distribution.dta", keep(master match) nogenerate
		
		* Denominator for both rates: row sum of the columns am_10 through am_37
		* from the age-distribution file.
		* NOTE(review): the variable name suggests the population aged 18-45;
		* presumably these are male counts by single year of age -- TODO confirm.
		egen age18_45_amm = rowtotal(am_10- am_37)
		

		* Casualty rate (per 100) of the county observed in 1910, carried to
		* both rows of the individual and then zeroed in the 1910 row, so that
		* the variable is nonzero in 1920 only.
		* casrate2 is not dropped below and therefore stays in the data.
		gen casrate2 = dead_all/age18_45_amm*100 if year==1910
		egen casualtyrate2 = max(casrate2), by(id)
		replace casualtyrate2 = 0 if year==1910
		
		* Treatment: top quintile of the casualty rate, switched on in 1920.
		* NOTE(review): the quintile cutpoints are computed over all rows,
		* including the 1910 rows that were just set to 0 -- confirm that this
		* pooling is intended.
		xtile k2 = casualtyrate2, nq(5)
		gen treat2 = (k2==5)*(year==1920)
		
		
		* Draft rate (per 100), built the same way and capped at 100.
		* Missing values compare as larger than any number, so they must be
		* excluded from the cap explicitly. The original test referred to a
		* variable named drate, which does not exist and only resolved to
		* drate2 through variable abbreviation; the full name is used here.
		gen drate2 = tot_accepted_num/age18_45_amm*100 if year==1910
		replace drate2 = 100 if drate2>100 & !missing(drate2)
		egen draftrate2 = max(drate2), by(id)
		replace draftrate2 = 0 if year==1910
		drop k2 drate2 all_* am_*
		
		ren (treat2 casualtyrate2 draftrate2) (treat casualtyrate draftrate)

		

	// Mover indicator.
	// moved is the within-person standard deviation of the county code; any
	// value other than zero (including missing) flags the person as a mover.
	// The flag is only switched on in rows that are not from 1910.
	egen moved = sd(fips), by(id)

	capture drop mover
	generate mover = (moved != 0) & (year != 1910)


	// Occupation (skill) groups from occ1950.
	// Codes below 100 form group 1, each further block of one hundred codes
	// up to 899 forms groups 2-9, 900-978 is group 10 and 979 and above is
	// group 11. Rows where occ1950 equals the system missing value keep 0.
	generate skill = 0
	replace skill = 1 if occ1950 < 100
	forvalues g = 2/9 {
		replace skill = `g' if occ1950 >= 100*(`g'-1) & occ1950 < 100*`g'
	}
	replace skill = 10 if occ1950 >= 900 & occ1950 < 979
	replace skill = 11 if occ1950 >= 979 & occ1950 != .

	// Value labels for the skill groups, added one entry at a time.
	label define skillab 1  "Professional, Technical"
	label define skillab 2  "Farmers", add
	label define skillab 3  "Managers, Officials, Proprietors", add
	label define skillab 4  "Clerical and Kindred", add
	label define skillab 5  "Sales workers", add
	label define skillab 6  "Craftsmen", add
	label define skillab 7  "Operatives", add
	label define skillab 8  "Service Workers (priv househ)", add
	label define skillab 9  "Farm Laborers", add
	label define skillab 10 "Laborers", add
	label define skillab 11 "Non-occupational response", add

	label values skill skillab


	// Industry groups from ind1950.
	// Consecutive entries of the cutpoint list delimit groups 1-17 (lower
	// bound inclusive, upper bound exclusive); 979 and above is group 18.
	// Codes below 100 and rows where ind1950 equals the system missing
	// value keep 0.
	local cuts 100 127 240 300 400 500 569 580 600 628 700 757 818 850 860 900 947 979

	generate industry = 0
	forvalues k = 1/17 {
		local next = `k' + 1
		local lo : word `k' of `cuts'
		local hi : word `next' of `cuts'
		replace industry = `k' if ind1950 >= `lo' & ind1950 < `hi'
	}
	replace industry = 18 if ind1950 >= 979 & ind1950 != .

	// Value labels for the industry groups, added one entry at a time.
	label define indlab 0  "N/A or none reported"
	label define indlab 1  "Agriculture, Forestry, Fishing", add
	label define indlab 2  "Mining", add
	label define indlab 3  "Construction", add
	label define indlab 4  "Manufacturing (dur.)", add
	label define indlab 5  "Manufacturing (nondur.)", add
	label define indlab 6  "Transportation", add
	label define indlab 7  "Telecommunications", add
	label define indlab 8  "Utilities, Sanitary Services", add
	label define indlab 9  "Wholesale Trade", add
	label define indlab 10 "Retail Trade", add
	label define indlab 11 "Finance, Insurance, Real Estate", add
	label define indlab 12 "Business and Repair Services", add
	label define indlab 13 "Personal services", add
	label define indlab 14 "Entertainment/Recreation Services", add
	label define indlab 15 "Professional and Related Services", add
	label define indlab 16 "Public Administration", add
	label define indlab 17 "Not yet specified", add
	label define indlab 18 "Others", add

	label values industry indlab
					 
					 
	// 1910 baseline characteristics.
	// For every listed variable the value observed in 1910 is spread to all
	// rows of the individual, the 1910 row itself is then set to 0, and the
	// value label named after the source variable is attached.
	local basevars urban skill industry farm  classwkr lit marst yrsusa2 statefip empstat famsize wksunemp school labforce fips
	quietly foreach v of local basevars {
		egen `v'1910 = max(cond(year==1910, `v', .)), by(id)
		replace `v'1910 = 0 if year==1910
		label values `v'1910 `v'_lbl
	}

	// Baseline (omitted) categories: the listed code of each 1910 variable is
	// recoded to 0. Pairs are variable stub and code:
	//   urban 1 (rural), skill 11 (non-occ response), farm 1 (non-farm),
	//   empstat 30 (not in labor force), lit 1 (illiterate), marst 6 (single),
	//   yrsusa2 5 (21+ yrs), statefip 48 (Texas),
	//   fips 42003 (Allegheny county, Pennsylvania),
	//   school 1 (not in school), labforce 1 (not in labor force)
	// NOTE(review): elsewhere in this file empstat is compared with 1, so
	// confirm that code 30 actually occurs in empstat.
	local basecats urban 1 skill 11 farm 1 empstat 30 lit 1 marst 6 yrsusa2 5 statefip 48 fips 42003 school 1 labforce 1
	while "`basecats'" != "" {
		gettoken stub basecats : basecats
		gettoken code basecats : basecats
		replace `stub'1910 = 0 if `stub'1910 == `code'
	}

	// The constructed group variables use the labels defined in this file.
	label values industry1910 indlab
	label values skill1910 skillab


	// Individual-level indicators built from citizenship, language and skill.
	generate nonat = inlist(citizen, 3, 4)
	generate noeng = (speakeng == 1)
	generate low   = inlist(skill, 8, 9, 10)

	// farmer: in industry group 1 in 1920 without having been in it in 1910.
	egen agri10 = max(industry1910==1), by(id)
	generate farmer = (industry == 1) & (year == 1920) & (agri10 == 0)

	// down: flags 1920 rows whose skill group is ranked below the 1910 group
	// according to the three rules below; it is 0 in all 1910 rows.
	generate down = 0 if year==1920
	replace down = 1 if year==1920 & (                                      ///
				(skill1910==1 & skill>1)                                |   ///
				(inlist(skill1910, 3, 4)    & (skill>4 | skill==2))     |   ///
				(inlist(skill1910, 5, 6, 7) & (skill>7 | skill==2))     )
	replace down = 0 if missing(down) & year==1910

	// Region indicators from the state name, each with a version that is
	// switched on in 1920 only. String inlist calls are split because of the
	// limit on the number of string arguments.
	generate south = inlist(statename, "Alabama", "Arkansas", "Delaware", "Florida", "Georgia", "Kentucky", "Louisiana") |   ///
					 inlist(statename, "Mississippi", "Missouri", "North Carolina", "South Carolina", "Tennessee", "Texas", "Virginia")

	generate south1920 = south*(year==1920)

	generate neweng = inlist(statename, "Maine", "Connecticut", "New Hampshire", "Rhode Island", "Vermont", "Massachusetts")

	generate neweng1920 = neweng*(year==1920)

	generate midwest = inlist(statename, "Ohio", "Indiana", "Michigan", "Illinois", "Wisconsin", "Missouri") |   ///
					   inlist(statename, "Iowa", "Minnesota", "North Dakota", "South Dakota", "Nebraska", "Kansas")
	generate notmidwest   = !midwest
	generate notmidwest20 = (notmidwest == 1) & (year == 1920)

	* Person-level flags for the 1910 industry group: group 1, groups 4 or 5,
	* and group 10 (agriculture, manufacturing and retail in the label set
	* defined in this file).
	egen maxagri = max(industry1910==1), by(id)
	egen maxmfg  = max(inlist(industry1910, 4, 5)), by(id)
	egen maxret  = max(industry1910==10), by(id)



	* One dummy per category of yrsusa2 (Dyrsusa21, Dyrsusa22, ...).
	quietly tabulate yrsusa2, generate(Dyrsusa2)
	*gen school = school==2

	* Binary indicators; each equals 1 when the source variable takes the
	* listed code and 0 otherwise (missing source values give 0).
	generate atwork      = (empstat   == 1)
	generate lfyes       = (labforce  == 2)
	generate literate    = (lit       == 4)
	generate married     = (marst     == 1)
	generate farmst      = (farm      == 2)
	generate speaks      = (speakeng  == 2)
	generate mfg         = inlist(industry, 4, 5)
	generate owns        = (ownershp  == 1)
	generate readwrite   = (lit       == 4)
	generate urbst       = (urban     == 2)
	generate employer    = (classwkr  == 2)
	generate naturalized = inlist(citizen, 2, 4)

	* County outflow data: only observations whose county is found in the
	* outflow file are retained; outflow is then zeroed in the 1910 rows.
	merge m:1 fips using "$path/Replication/cleaned_data/outflow_from_cnty_cens.dta", keep(match) nogenerate
	replace outflow = 0 if year==1910

	* LIDO scores matched on state, sex, age, race, occupation and industry:
	* all Census observations are retained, unmatched score cells are discarded.
	merge m:1 statefip sex age race occ1950 ind1950 using "$path/Replication/raw_data/lido_score_1950_public_use.dta", keep(master match) nogenerate

	* Log outcomes; a score that is zero, negative or missing gives a missing log.
	generate lnoccscore = ln(occscore)
	generate lnlido     = ln(lido)
	*keep if lnlido!=.

	*drop if skill1910==0


	// Control set as a global macro; it is not used in this file and is
	// presumably read by later estimation scripts -- TODO confirm.
	global controls "c.draftrate i.urban1910 i.skill1910 i.farm1910 i.empstat1910 i.lit1910 i.marst1910 c.famsize1910 i.school1910 i.labforce1910 c.wksunemp1910"


	* works
	*replace lnoccscore = 0 if lnoccscore==.
	*replace lnlido = 0 if lnlido==.

	* Keep non-movers only: drop every individual flagged as a mover in any row.
	egen maxmover = max(mover), by(id)
	keep if maxmover==0

	* Occupation-group indicators.
	gen mgr = skill==1 | skill==3
	gen op = skill==7
	gen craft = skill==6

	* Keep rows with occupation and industry codes up to 970; missing codes
	* compare as larger than any number and are dropped as well.
	keep if occ1950<=970 & ind1950<=970

	gen lnoccscore1 = ln(1+occscore)
	gen lnlido1 = ln(1+lido)

	* Drop individuals who are in skill group 2 (farmers) in either year.
	egen max2 = max(skill==2), by(id)
	drop if max2==1

	* Drop rows without a LIDO-based outcome BEFORE balancing the panel.
	* The original ran this after the balance filter, so people with a
	* missing score in only one year were saved with a single observation.
	drop if missing(lnlido)
		
	* keep those who have a full panel (2 periods); this must be the last
	* row-level restriction so that the saved sample is balanced
	bysort id: keep if _N==2

	
	compress
	save "$path/Replication/cleaned_data/linked_Americans_estimation_sample.dta", replace
	clear